{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### kmeans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.cluster import KMeans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 2],\n",
       "       [0, 0],\n",
       "       [1, 0],\n",
       "       [5, 0],\n",
       "       [5, 2]])"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = np.array([[0,2],[0,0],[1,0],[5,0],[5,2]])\n",
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1 1 1 0 0]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "kmeans = KMeans(n_clusters=2, random_state=0).fit(X)\n",
    "print(kmeans.labels_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 0])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kmeans.predict([[0, 0], [12, 3]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[5.        , 1.        ],\n",
       "       [0.33333333, 0.66666667]])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kmeans.cluster_centers_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5.0\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "# 定义欧式距离\n",
    "def euclidean_distance(x1, x2):\n",
    "    distance = 0\n",
    "    # 距离的平方项再开根号\n",
    "    for i in range(len(x1)):\n",
    "        distance += pow((x1[i] - x2[i]), 2)\n",
    "    return np.sqrt(distance)\n",
    "\n",
    "print(euclidean_distance(X[0], X[4]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 定义中心初始化函数\n",
    "def centroids_init(k, X):\n",
    "    m, n = X.shape\n",
    "    centroids = np.zeros((k, n))\n",
    "    for i in range(k):\n",
    "        # 每一次循环随机选择一个类别中心\n",
    "        centroid = X[np.random.choice(range(m))]\n",
    "        centroids[i] = centroid\n",
    "    return centroids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 定义样本的最近质心点所属的类别索引\n",
    "def closest_centroid(sample, centroids):\n",
    "    closest_i = 0\n",
    "    closest_dist = float('inf')\n",
    "    for i, centroid in enumerate(centroids):\n",
    "        # 根据欧式距离判断，选择最小距离的中心点所属类别\n",
    "        distance = euclidean_distance(sample, centroid)\n",
    "        if distance < closest_dist:\n",
    "            closest_i = i\n",
    "            closest_dist = distance\n",
    "    return closest_i"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 定义构建类别过程\n",
    "def build_clusters(centroids, k, X):\n",
    "    clusters = [[] for _ in range(k)]\n",
    "    for x_i, x in enumerate(X):\n",
    "        # 将样本划分到最近的类别区域\n",
    "        centroid_i = closest_centroid(x, centroids)\n",
    "        clusters[centroid_i].append(x_i)\n",
    "    return clusters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 根据上一步聚类结果计算新的中心点\n",
    "def calculate_centroids(clusters, k, X):\n",
    "    n = X.shape[1]\n",
    "    centroids = np.zeros((k, n))\n",
    "    # 以当前每个类样本的均值为新的中心点\n",
    "    for i, cluster in enumerate(clusters):\n",
    "        centroid = np.mean(X[cluster], axis=0)\n",
    "        centroids[i] = centroid\n",
    "    return centroids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 获取每个样本所属的聚类类别\n",
    "def get_cluster_labels(clusters, X):\n",
    "    y_pred = np.zeros(X.shape[0])\n",
    "    for cluster_i, cluster in enumerate(clusters):\n",
    "        for X_i in cluster:\n",
    "            y_pred[X_i] = cluster_i\n",
    "    return y_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 根据上述各流程定义kmeans算法流程\n",
    "def kmeans(X, k, max_iterations):\n",
    "    # 1.初始化中心点\n",
    "    centroids = centroids_init(k, X)\n",
    "    # 遍历迭代求解\n",
    "    for _ in range(max_iterations):\n",
    "        # 2.根据当前中心点进行聚类\n",
    "        clusters = build_clusters(centroids, k, X)\n",
    "        # 保存当前中心点\n",
    "        prev_centroids = centroids\n",
    "        # 3.根据聚类结果计算新的中心点\n",
    "        centroids = calculate_centroids(clusters, k, X)\n",
    "        # 4.设定收敛条件为中心点是否发生变化\n",
    "        diff = centroids - prev_centroids\n",
    "        if not diff.any():\n",
    "            break\n",
    "    # 返回最终的聚类标签\n",
    "    return get_cluster_labels(clusters, X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0. 0. 0. 1. 1.]\n"
     ]
    }
   ],
   "source": [
    "# 测试数据\n",
    "X = np.array([[0,2],[0,0],[1,0],[5,0],[5,2]])\n",
    "# 设定聚类类别为2个，最大迭代次数为10次\n",
    "labels = kmeans(X, 2, 10)\n",
    "# 打印每个样本所属的类别标签\n",
    "print(labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
